data.orig <- read_xls(path='default of credit card clients.xls',
sheet='Data', skip=1)
data <- data.frame(data.orig)
colnames(data) <- c('id', 'credit_limit', 'gender', 'education', 'marital', 'age', 'rs_sep', 'rs_aug', 'rs_july', 'rs_june', 'rs_may', 'rs_apr', 'bs_sep', 'bs_aug', 'bs_july', 'bs_june', 'bs_may', 'bs_apr', 'ap_sep', 'ap_aug', 'ap_july', 'ap_june', 'ap_may', 'ap_apr', 'default_r')
data <- data[data$marital!=0,]
# Marital status (1 = married; 2 = single; 3 = others)
data[data$marital == 2, 'marital'] <- 'Single'
data[data$marital == 1, 'marital'] <- 'Married'
data[data$marital == 3, 'marital'] <- 'Others'
marital.levels <- c("Single", "Married", "Others")
data$marital <- factor(data$marital, levels = marital.levels)
data[data$education >= 4 | data$education ==0, 'education'] <- 4
# Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)
data[data$education==1, 'education'] <- 'Graduate School'
data[data$education==2, 'education'] <- 'University'
data[data$education==3, 'education'] <- 'High School'
data[data$education==4, 'education'] <- 'Others'
education.levels <- c("Others", "High School", "Graduate School", "University")
data$education <- factor(data$education, levels=education.levels)
# Gender (1 = male; 2 = female)
data[data$gender==1, 'gender'] <- 'Male'
data[data$gender==2, 'gender'] <- 'Female'
gender_levels <- c("Male", "Female")
data$gender <- factor(data$gender, levels=gender_levels)
data$rs_sep <- factor(data$rs_sep)
data$rs_aug <- factor(data$rs_aug)
data$rs_july <- factor(data$rs_july)
data$rs_june <- factor(data$rs_june)
data$rs_may <- factor(data$rs_may)
data$rs_apr <- factor(data$rs_apr)
data[data$default_r==1, 'default_r'] <- 'Yes'
data[data$default_r==0, 'default_r'] <- 'No'
default_levels <- c("No", "Yes")
data$default_r <- factor(data$default_r, levels=default_levels)
In recent years, the credit card issuers in Taiwan faced the cash and credit card debt crisis and the delinquency is expected to peak in the third quarter of 2006 (Chou, 2006). In order to increase market share, card-issuing banks in Taiwan over-issued cash and credit cards to unqualified applicants. At the same time, most cardholders, irrespective of their repayment ability, overused credit card for consumption and accumulated heavy credit and cash– card debts. The crisis caused the blow to consumer finance confidence and it is a big challenge for both banks and cardholders.
str(data)
## 'data.frame': 29946 obs. of 25 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ credit_limit: num 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ gender : Factor w/ 2 levels "Male","Female": 2 2 2 2 1 1 1 2 2 1 ...
## $ education : Factor w/ 4 levels "Others","High School",..: 4 4 4 4 4 3 3 4 2 2 ...
## $ marital : Factor w/ 3 levels "Single","Married",..: 2 1 1 2 2 1 1 1 2 1 ...
## $ age : num 24 26 34 37 57 37 29 23 28 35 ...
## $ rs_sep : Factor w/ 11 levels "-2","-1","0",..: 5 2 3 3 2 3 3 3 3 1 ...
## $ rs_aug : Factor w/ 11 levels "-2","-1","0",..: 5 5 3 3 3 3 3 2 3 1 ...
## $ rs_july : Factor w/ 11 levels "-2","-1","0",..: 2 3 3 3 2 3 3 2 5 1 ...
## $ rs_june : Factor w/ 11 levels "-2","-1","0",..: 2 3 3 3 3 3 3 3 3 1 ...
## $ rs_may : Factor w/ 10 levels "-2","-1","0",..: 1 3 3 3 3 3 3 3 3 2 ...
## $ rs_apr : Factor w/ 10 levels "-2","-1","0",..: 1 4 3 3 3 3 3 2 3 2 ...
## $ bs_sep : num 3913 2682 29239 46990 8617 ...
## $ bs_aug : num 3102 1725 14027 48233 5670 ...
## $ bs_july : num 689 2682 13559 49291 35835 ...
## $ bs_june : num 0 3272 14331 28314 20940 ...
## $ bs_may : num 0 3455 14948 28959 19146 ...
## $ bs_apr : num 0 3261 15549 29547 19131 ...
## $ ap_sep : num 0 0 1518 2000 2000 ...
## $ ap_aug : num 689 1000 1500 2019 36681 ...
## $ ap_july : num 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ ap_june : num 0 1000 1000 1100 9000 ...
## $ ap_may : num 0 0 1000 1069 689 ...
## $ ap_apr : num 0 2000 5000 1000 679 ...
## $ default_r : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
rmarkdown::paged_table(data)
hist.default <- ggplot(data, aes(x=default_r)) +
geom_bar() +
xlab('Credit Card Default') +
ggtitle('Imbalanced distribution of response variable')
bxpt.default <- ggplot(data, aes(x=default_r, y=credit_limit)) +
geom_boxplot(outlier.color = 'red', outlier.shape = 8, fill='gray') +
stat_boxplot(geom='errorbar', width=0.5) + xlab('Default')
grid.arrange(hist.default, bxpt.default, nrow=1)
data <- SMOTE(default_r~., data=data, perc.over=280)
hist.default <- ggplot(data, aes(x=default_r)) +
geom_bar() +
xlab('Credit Card Default') +
ggtitle('Imbalanced distribution of response variable')
bxpt.default <- ggplot(data, aes(x=default_r, y=credit_limit)) +
geom_boxplot(outlier.color = 'red', outlier.shape = 8, fill='gray') +
stat_boxplot(geom='errorbar', width=0.5) + xlab('Default')
grid.arrange(hist.default, bxpt.default, nrow=1)
# H0: Credit limit of those who default is less than that of who pay promptly
# H1: Credit limit of those who default is higher than that of who pay promptly
ttest.credit_limit <- t.test(data[data$default_r=="Yes", 'credit_limit'],
mu=mean(data[data$default_r=="No", 'credit_limit']),
alternative='greater')
ttest.credit_limit
##
## One Sample t-test
##
## data: data[data$default_r == "Yes", "credit_limit"]
## t = -70.654, df = 19892, p-value = 1
## alternative hypothesis: true mean is greater than 179123.7
## 95 percent confidence interval:
## 124308.7 Inf
## sample estimates:
## mean of x
## 125555.8
hist.age <- ggplot(data, aes(x=age)) +
geom_histogram(color='white') +
geom_vline(xintercept=mean(data$age), color='blue', lwd=1.6) +
geom_vline(xintercept=median(data$age), color='red', lwd=1.6,
linetype='dashed')
ggtitle('Age distribution')
## $title
## [1] "Age distribution"
##
## attr(,"class")
## [1] "labels"
bxpt.age <- ggplot(data, aes(x=default_r, y=age)) +
geom_boxplot(outlier.color = 'red', outlier.shape = 8,
fill='gray') +
stat_boxplot(geom='errorbar', width=0.5) + xlab('Default')
grid.arrange(hist.age, bxpt.age, nrow=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ttest.age_default <- t.test(data[data$default_r=="Yes", 'age'],
data[data$default_r=="No", 'age'])
ttest.age_default
##
## Welch Two Sample t-test
##
## data: data[data$default_r == "Yes", "age"] and data[data$default_r == "No", "age"]
## t = 3.8077, df = 43272, p-value = 0.0001405
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1565759 0.4887748
## sample estimates:
## mean of x mean of y
## 35.74497 35.42230
The t.test on age between those defaulted on their
credit card bills and those who did not, returned a p.value
of 1.404827^{-4}. In simple words, the age does have a connection on
those who default.
# Group them so we can have an overall understanding of at which age the default
# happens
data$age_group = 0
data[data$age <= 35, "age_group"] <- "Young"
data[data$age >35 & data$age <= 50, "age_group"] <- "Young Adults"
data[data$age >50 & data$age < 66, "age_group"] <- "Mid-age Adults"
data[data$age >= 66, "age_group"] <- "Seniors"
data$age_group <- factor(data$age_group, levels=c("Young", "Young Adults",
"Mid-age Adults", "Seniors"))
age_group_default_cont <- table(data$default_r, data$age_group)
chisq.age_default <- chisq.test(age_group_default_cont)
age_group_default_cont <- data.frame(age_group_default_cont)
colnames(age_group_default_cont) <- c("Default", "Age_Group", "Freq")
bxpt.age_group_default <- ggplot(data, aes(x=age_group, y=age)) +
geom_boxplot(outlier.color='red', outlier.shape=8,
fill='gray') +
stat_boxplot(geom='errorbar', width=0.5)
bar.age_group_default <- ggplot(age_group_default_cont, aes(x=Age_Group, y=Freq, fill=Default)) +
geom_bar(stat='identity', position='identity', alpha=0.55)
grid.arrange(bxpt.age_group_default, bar.age_group_default, nrow=1)
# tmp_data <- data %>% group_by(age_group) %>% summarise(default_count = count(default_r))
chisq.age_default
##
## Pearson's Chi-squared test
##
## data: age_group_default_cont
## X-squared = 52.064, df = 3, p-value = 2.903e-11
educ_default_cont <- table(data$default, data$education)
chisq.educ_default <- chisq.test(educ_default_cont)
educ_default_cont <- data.frame(educ_default_cont)
colnames(educ_default_cont) <- c("Default", "Education", "Freq")
hist.educ <- ggplot(educ_default_cont, aes(x=Education, y=Freq, fill=Default)) +
geom_bar(stat='identity', position='identity', alpha=0.65)
hist.educ
chisq.educ_default
##
## Pearson's Chi-squared test
##
## data: educ_default_cont
## X-squared = 557.57, df = 3, p-value < 2.2e-16
# Marital status (1 = married; 2 = single; 3 = others)
# Remove rows that have marital 0 since it is not given any meaning
marital_default_cont <- table(data$default_r, data$marital)
chisq.marital_default <- chisq.test(marital_default_cont)
marital_default_cont <- data.frame(marital_default_cont)
colnames(marital_default_cont) <- c("Default", "Marital", "Freq")
ggplot(marital_default_cont, aes(x=Marital, y=Freq, fill=Default)) +
geom_bar(stat='identity', position='identity', alpha=0.65) + xlab('Marital')
# marital_default_cont
chisq.marital_default
##
## Pearson's Chi-squared test
##
## data: marital_default_cont
## X-squared = 142.61, df = 2, p-value < 2.2e-16
gender_default_cont <- table(data$gender, data$default_r)
chisq.gender_default <- chisq.test(gender_default_cont)
gender_default_cont <- data.frame(gender_default_cont)
colnames(gender_default_cont) <- c("Gender", "Default", "Freq")
hist.gender_default <- ggplot(gender_default_cont, aes(x=Gender, y=Freq, fill=Default)) +
geom_bar(stat='identity', position='identity', alpha=0.65)
hist.gender_default
chisq.gender_default
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: gender_default_cont
## X-squared = 442.19, df = 1, p-value < 2.2e-16
sep_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_sep, y=)) +
geom_bar(aes(y = (..count..)/sum(..count..))) + ylab('Density') +
ylim(c(0, 0.65)) +
ggtitle('Sep Default RP')
sep_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_sep)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) +
ggtitle('Sep Non-Default RP')
aug_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_aug)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('Aug Default RP')
aug_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_aug)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('Aug Non-Default RP')
july_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_july)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('July Default RP')
july_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_july)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('July Non-Default RP')
june_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_june)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('June Default RP')
june_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_june)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('June Non-Default RP')
may_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_may)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('May Default RP')
may_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_may)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('May Non-Default RP')
april_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_apr)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('April Default RP')
april_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_apr)) + ylab('Density') +
geom_bar(aes(y = (..count..)/sum(..count..))) +
ylim(c(0, 0.65)) + ggtitle('April Non-Default RP')
grid.arrange(april_df, april_ndf, may_df, may_ndf, june_df, june_ndf,
july_df, july_ndf, aug_df, aug_ndf, sep_df, sep_ndf, nrow=6, ncol=2 )
library(dplyr)
ag_due_summary <- data %>%
dplyr::group_by(age_group) %>%
dplyr::summarize(total_bill = median(bs_sep + bs_aug + bs_july + bs_june + bs_may + bs_apr),
total_paid = median(ap_sep + ap_aug + ap_july + ap_june + ap_may + ap_apr),
credit_limit = median(credit_limit))
ag_due_summary <- data.frame(ag_due_summary)
ag_due_summary$due <- ag_due_summary$total_bill - ag_due_summary$total_paid
ag_due_summary$due_percent <- (ag_due_summary$due/ag_due_summary$credit_limit) * 100
rmarkdown::paged_table(ag_due_summary)
med_due <- round(ag_due_summary$due_percent)
age_labels <- ag_due_summary$age_group
lbls <- paste(age_labels, ' ', med_due, '%')
pie(med_due, labels=lbls, col=rainbow(length(lbls)), main="Median Due by age groups")
getproportion <- function(x)
{
ul <- unique(x)
count <- NULL
for(e in ul) # Hoping it produces in the same order as stored in ul
{
count <- c(count, sum(x==e))
}
count <- (count/sum(count)) * 100
return(count)
# m_c <- max(count, na.rm=T)
# m_i <- which(count == m_c)
# return(ul[m_i])
}
education.young <- getproportion(data[data$age_group=='Young', 'education'])
education.young_adults <- getproportion(data[data$age_group=='Young Adults', 'education'])
education.midage_adults <- getproportion(data[data$age_group=='Mid-age Adults', 'education'])
education.seniors <- getproportion(data[data$age_group=='Seniors', 'education'])
young.educ_perc <- round(education.young)
young_adults.educ_perc <- round(education.young_adults)
midage_adults.educ_perc <- round(education.midage_adults)
seniors.educ_perc <- round(education.seniors)
# Young
lbls <- paste(education.levels, ' ', young.educ_perc, '%')
pie(young.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Young")
# Young Adults
lbls <- paste(education.levels, ' ', young_adults.educ_perc, '%')
pie(young.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Young Adults")
# Mid-age Adults
lbls <- paste(education.levels, ' ', midage_adults.educ_perc, '%')
pie(midage_adults.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Mid-age adults")
# Seniors
lbls <- paste(education.levels, ' ', seniors.educ_perc, '%')
pie(seniors.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Seniors")
full_model <- glm('default_r~.-id-age_group', data=data, family='binomial')
# full_model
summary(full_model)
##
## Call:
## glm(formula = "default_r~.-id-age_group", family = "binomial",
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5309 -0.7849 -0.5185 0.8487 4.0266
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.201e+00 1.429e-01 -8.404 < 2e-16 ***
## credit_limit -2.078e-06 1.220e-07 -17.030 < 2e-16 ***
## genderFemale -3.632e-01 2.280e-02 -15.929 < 2e-16 ***
## educationHigh School 1.449e+00 1.305e-01 11.102 < 2e-16 ***
## educationGraduate School 1.281e+00 1.294e-01 9.907 < 2e-16 ***
## educationUniversity 1.229e+00 1.289e-01 9.537 < 2e-16 ***
## maritalMarried 1.727e-01 2.427e-02 7.113 1.14e-12 ***
## maritalOthers 5.250e-01 9.649e-02 5.441 5.29e-08 ***
## age 2.136e-03 1.368e-03 1.562 0.118290
## rs_sep-1 3.904e-01 5.863e-02 6.658 2.77e-11 ***
## rs_sep0 -8.737e-02 6.014e-02 -1.453 0.146314
## rs_sep1 5.235e-01 5.202e-02 10.063 < 2e-16 ***
## rs_sep2 1.791e+00 6.537e-02 27.392 < 2e-16 ***
## rs_sep3 1.709e+00 1.226e-01 13.943 < 2e-16 ***
## rs_sep4 1.819e+00 2.358e-01 7.712 1.24e-14 ***
## rs_sep5 1.062e+00 3.729e-01 2.848 0.004406 **
## rs_sep6 -1.627e-01 5.308e-01 -0.307 0.759218
## rs_sep7 6.153e-01 5.825e-01 1.056 0.290836
## rs_sep8 1.699e+00 4.518e-01 3.759 0.000170 ***
## rs_aug-1 -1.033e-01 5.546e-02 -1.862 0.062539 .
## rs_aug0 -3.091e-01 5.969e-02 -5.178 2.24e-07 ***
## rs_aug1 -1.023e+00 3.900e-01 -2.624 0.008699 **
## rs_aug2 4.131e-01 5.807e-02 7.114 1.13e-12 ***
## rs_aug3 3.850e-01 1.142e-01 3.371 0.000748 ***
## rs_aug4 3.126e-02 1.989e-01 0.157 0.875102
## rs_aug5 -3.951e-02 4.013e-01 -0.098 0.921573
## rs_aug6 8.819e-01 5.056e-01 1.744 0.081104 .
## rs_aug7 6.131e-01 4.685e-01 1.309 0.190613
## rs_aug8 -1.496e+01 3.247e+02 -0.046 0.963255
## rs_july-1 -1.039e-01 5.392e-02 -1.928 0.053911 .
## rs_july0 -1.212e-01 5.566e-02 -2.178 0.029388 *
## rs_july1 -1.252e+01 1.700e+02 -0.074 0.941302
## rs_july2 4.004e-01 5.834e-02 6.863 6.73e-12 ***
## rs_july3 4.978e-01 1.342e-01 3.708 0.000209 ***
## rs_july4 3.820e-01 2.382e-01 1.604 0.108785
## rs_july5 -4.150e-02 4.385e-01 -0.095 0.924597
## rs_july6 9.590e-01 4.565e-01 2.101 0.035643 *
## rs_july7 1.140e+00 5.117e-01 2.229 0.025840 *
## rs_july8 -4.793e+00 1.423e+00 -3.368 0.000757 ***
## rs_june-1 -1.865e-01 5.293e-02 -3.523 0.000427 ***
## rs_june0 -2.035e-01 5.272e-02 -3.859 0.000114 ***
## rs_june1 1.392e+01 1.700e+02 0.082 0.934750
## rs_june2 4.349e-01 5.909e-02 7.361 1.82e-13 ***
## rs_june3 5.669e-01 1.525e-01 3.718 0.000200 ***
## rs_june4 7.078e-01 2.399e-01 2.950 0.003176 **
## rs_june5 1.026e-01 3.324e-01 0.309 0.757669
## rs_june6 -5.542e-01 8.787e-01 -0.631 0.528254
## rs_june7 1.186e+00 3.623e-01 3.274 0.001062 **
## rs_june8 1.159e+01 1.099e+02 0.105 0.916025
## rs_may-1 -1.599e-01 5.103e-02 -3.134 0.001726 **
## rs_may0 -1.940e-01 5.015e-02 -3.869 0.000109 ***
## rs_may2 4.508e-01 6.033e-02 7.473 7.82e-14 ***
## rs_may3 6.831e-01 1.607e-01 4.252 2.12e-05 ***
## rs_may4 6.373e-01 2.398e-01 2.657 0.007882 **
## rs_may5 9.030e-01 4.049e-01 2.230 0.025726 *
## rs_may6 -6.403e-01 1.715e+00 -0.373 0.708807
## rs_may7 1.054e+00 3.624e-01 2.907 0.003649 **
## rs_may8 4.982e+00 1.471e+02 0.034 0.972982
## rs_apr-1 -4.350e-02 4.571e-02 -0.952 0.341298
## rs_apr0 -2.829e-01 4.538e-02 -6.234 4.53e-10 ***
## rs_apr2 3.814e-01 5.526e-02 6.901 5.17e-12 ***
## rs_apr3 1.213e+00 1.622e-01 7.481 7.36e-14 ***
## rs_apr4 2.918e-01 2.858e-01 1.021 0.307374
## rs_apr5 1.128e+00 6.102e-01 1.849 0.064423 .
## rs_apr6 1.187e+00 4.497e-01 2.639 0.008317 **
## rs_apr7 8.686e-01 3.820e-01 2.274 0.022977 *
## rs_apr8 1.022e+01 1.130e+02 0.090 0.927931
## bs_sep -3.144e-06 8.363e-07 -3.759 0.000171 ***
## bs_aug 3.238e-06 1.110e-06 2.916 0.003544 **
## bs_july 1.952e-06 1.041e-06 1.875 0.060778 .
## bs_june 3.817e-07 1.083e-06 0.353 0.724399
## bs_may -1.001e-06 1.267e-06 -0.790 0.429612
## bs_apr 1.266e-06 1.016e-06 1.246 0.212662
## ap_sep -1.622e-05 1.844e-06 -8.795 < 2e-16 ***
## ap_aug -1.160e-05 1.610e-06 -7.204 5.83e-13 ***
## ap_july -4.031e-06 1.427e-06 -2.824 0.004743 **
## ap_june -4.994e-06 1.501e-06 -3.327 0.000879 ***
## ap_may -7.142e-06 1.471e-06 -4.855 1.20e-06 ***
## ap_apr -3.449e-06 1.095e-06 -3.150 0.001635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 63397 on 46416 degrees of freedom
## Residual deviance: 47819 on 46338 degrees of freedom
## AIC: 47977
##
## Number of Fisher Scoring iterations: 11
get_cm <- function(predictions, model, co, g_truth)
{
# pred.list <- round(predictions) # for now
pred.list <- predictions
pred.list[predictions <= co] <- 1
pred.list[predictions > co] <- 2
cm <- matrix(0, nrow=2, ncol=2)
for(i in 1:length(pred.list))
{
actual <- g_truth[i]
pred <- pred.list[i]
cm[pred, actual] <- cm[pred, actual] + 1
}
colnames(cm) <- c('Act.Pos', 'Act.Neg')
return(cm)
}
get_evalmetrics <- function(cm, model, n_preds)
{
precision <- cm[1, 1]/(cm[1, 1] + cm[1, 2])
recall <- cm[1, 1]/(cm[1, 1] + cm[2, 1])
sensitivity <- recall
specificity <- cm[2, 2]/(cm[2, 2] + cm[1, 2])
f1 <- 2 * ((precision * recall)/(precision + recall))
mcfadden <- 1 - model$deviance/model$null.deviance
accuracy <- (cm[1, 1] + cm[2, 2])/n_preds
metrics <- c(precision, recall, sensitivity, specificity, f1, mcfadden, accuracy)
return(metrics)
}
fm.y_prob <- predict.glm(full_model, newdata=data, type=c("response"))
tmp_cm <- get_cm(fm.y_prob, full_model, 0.5, data$default_r)
tmp_metrics <- get_evalmetrics(tmp_cm, full_model, length(fm.y_prob))
tmp_cm
## Act.Pos Act.Neg
## [1,] 23109 7575
## [2,] 3415 12318
tmp_metrics
## Act.Pos Act.Pos Act.Pos Act.Neg Act.Pos Act.Pos
## 0.7531287 0.8712487 0.8712487 0.6192128 0.8078940 0.2457158 0.7632333
exp_coeff <- exp(full_model$coefficients)
exp_coeff
## (Intercept) credit_limit genderFemale
## 3.009243e-01 9.999979e-01 6.954767e-01
## educationHigh School educationGraduate School educationUniversity
## 4.257172e+00 3.601965e+00 3.419274e+00
## maritalMarried maritalOthers age
## 1.188459e+00 1.690477e+00 1.002139e+00
## rs_sep-1 rs_sep0 rs_sep1
## 1.477527e+00 9.163422e-01 1.687897e+00
## rs_sep2 rs_sep3 rs_sep4
## 5.993234e+00 5.522436e+00 6.162932e+00
## rs_sep5 rs_sep6 rs_sep7
## 2.891303e+00 8.498445e-01 1.850197e+00
## rs_sep8 rs_aug-1 rs_aug0
## 5.466350e+00 9.018615e-01 7.341167e-01
## rs_aug1 rs_aug2 rs_aug3
## 3.594014e-01 1.511536e+00 1.469680e+00
## rs_aug4 rs_aug5 rs_aug6
## 1.031756e+00 9.612567e-01 2.415572e+00
## rs_aug7 rs_aug8 rs_july-1
## 1.846198e+00 3.181072e-07 9.012790e-01
## rs_july0 rs_july1 rs_july2
## 8.858127e-01 3.652197e-06 1.492451e+00
## rs_july3 rs_july4 rs_july5
## 1.645091e+00 1.465233e+00 9.593516e-01
## rs_july6 rs_july7 rs_july8
## 2.609069e+00 3.127883e+00 8.288328e-03
## rs_june-1 rs_june0 rs_june1
## 8.298782e-01 8.158909e-01 1.110380e+06
## rs_june2 rs_june3 rs_june4
## 1.544884e+00 1.762782e+00 2.029451e+00
## rs_june5 rs_june6 rs_june7
## 1.108020e+00 5.745553e-01 3.274081e+00
## rs_june8 rs_may-1 rs_may0
## 1.076623e+05 8.522213e-01 8.236293e-01
## rs_may2 rs_may3 rs_may4
## 1.569624e+00 1.980032e+00 1.891351e+00
## rs_may5 rs_may6 rs_may7
## 2.466964e+00 5.271105e-01 2.868097e+00
## rs_may8 rs_apr-1 rs_apr0
## 1.457459e+02 9.574337e-01 7.535849e-01
## rs_apr2 rs_apr3 rs_apr4
## 1.464261e+00 3.364797e+00 1.338798e+00
## rs_apr5 rs_apr6 rs_apr7
## 3.090482e+00 3.276317e+00 2.383550e+00
## rs_apr8 bs_sep bs_aug
## 2.746195e+04 9.999969e-01 1.000003e+00
## bs_july bs_june bs_may
## 1.000002e+00 1.000000e+00 9.999990e-01
## bs_apr ap_sep ap_aug
## 1.000001e+00 9.999838e-01 9.999884e-01
## ap_july ap_june ap_may
## 9.999960e-01 9.999950e-01 9.999929e-01
## ap_apr
## 9.999966e-01
Based on the intercept value 0.3009243 a person is less likely to default on average.